
This notebook seeks to identify health trends from the Big Cities Health Inventory (BCHI). The BCHI is a compilation of health data spanning 53 indicators across 27 major US cities. The original dataset contains 17 variables and 18329 records collected from various sources. The indicators are broken down into 11 categories, including:
During this exploration, we will reshape the data into 58 variables across 1814 records to allow for multivariate analysis.
# Data manipulation
import pandas as pd
pd.options.display.max_columns = 60
import numpy as np
from IPython.display import display
# Visualizations
import missingno as msno
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid", {'axes.grid': False})
# Machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
The following creates a small sample file that can be opened easily in Excel.
import unicodecsv
with open('data/bchi.csv', 'rb') as fin:
freader = unicodecsv.reader(fin, delimiter=',')
with open('data/bchi_sample.csv', 'wb') as fout:
fwriter = unicodecsv.writer(fout, delimiter=',')
count = 0
for row in freader:
fwriter.writerow(row)
count += 1
if count==250:
break
health = pd.read_csv('data/bchi.csv')
health_df = pd.DataFrame(health)
location_df = health_df.copy()
location_df.loc[:,'City'] = location_df['Place'].str.split(', ').str.get(0)
location_df.loc[:,'State'] = location_df['Place']
location_df.loc[location_df['State']!='U.S. Total','State'] = location_df['Place'].str.split(', ').str.get(1)
location_df.to_csv('data/bhmi_locations.csv')
health_df.shape
health_df.info()
health_df.describe(include=['O'])
Hmm, it looks like some of the floats are being cast as objects. I'll have to clean those up in the next section.
health_df.describe(include=['int64'])
health_df.head()
health_df.tail()
len(health_df[health_df.isnull().any(axis=1)])
Looks like all 18,329 rows have at least one column with a missing value.
msno.matrix(health_df)
msno.bar(health_df)
msno.heatmap(health_df)
indicator_categories = health_df['Indicator Category'].value_counts()
x = indicator_categories.index
y = indicator_categories.values
plt.figure(figsize=(12,8))
plt.bar(range(len(x)), y)
plt.xticks(range(len(x)), x, rotation=60, ha='right')
plt.title('Indicator Category')
plt.show()
categories = set(health_df['Indicator Category'])
for category in categories:
category_counts = health_df[health_df['Indicator Category'] == category]['Indicator'].value_counts()
x = category_counts.index
y = category_counts.values
plt.figure(figsize=(12,8))
plt.bar(range(len(x)), y)
plt.xticks(range(len(x)), x, rotation=60, ha='right')
plt.title(category)
plt.show()
object_cols = health_df.select_dtypes(include=['object']).columns
for object_col in object_cols:
health_df[object_col] = health_df[object_col].str.rstrip()
health_df.Year = pd.to_datetime(health_df.Year, format='%Y')
float_cols = [
'Value',
'90% Confidence Level - Low',
'90% Confidence Level - High',
'95% Confidence Level - Low',
'95% Confidence Level - High'
]
for float_col in float_cols:
health_df[float_col] = health_df[float_col].str.replace(',', '')
health_df[float_col] = health_df[float_col].str.replace('\xc2\xa0', '')
health_df[float_col] = pd.to_numeric(health_df[float_col])
health_df.describe()
Minor annoyance
health_df.rename(columns ={'Place': 'Location'}, inplace=True)
It's nice that the data started off in tidy format, but it makes it harder to compare variables using multivariate analyses. We'll have to reshape the data so that each indicator has it's own column. This will also give us a better understanding of the sparseness of the dataset.
health_reshaped = health_df.pivot_table(
index=['Location', 'Year', 'Sex', 'Race/Ethnicity'],
columns='Indicator',
values='Value'
)
health_reshaped.reset_index(inplace=True)
health_reshaped.columns.name = None
health_reshaped.loc[:,'City'] = health_reshaped['Location'].str.split(', ').str.get(0)
health_reshaped.loc[:,'State'] = health_reshaped['Location']
health_reshaped.loc[health_reshaped['State']!='U.S. Total','State'] = health_reshaped['Location'].str.split(', ').str.get(1)
del health_reshaped['Location']
cols = health_reshaped.columns.tolist()
col_order = cols[-2:] + cols[:-2]
health_reshaped = health_reshaped[col_order]
health_reshaped.head()
As expected, there are a lot of null values throughout. Each study was carried out by a different agency and pieced together, so while there may be some overlap, it's not a guarantee. This will make modeling difficult. We may have to just stick with visualizations.
health_reshaped.to_csv('data/bhmi_reshaped.csv')
Get indicator name and category value counts.
msno.matrix(health_reshaped)
msno.bar(health_reshaped)
msno.heatmap(health_reshaped)
Look at violin plots for chronic disease indicators by location.
Compare locations based on:
Need to drop outliers before creating correlation matrix. For loop?
health_corr_mat = health_reshaped[health_reshaped['Rate of Laboratory Confirmed Infections Caused by Shiga Toxin-Producing E-Coli (Per 100,000 people)']<200].corr()
plt.figure(figsize=(12,8))
sns.heatmap(health_corr_mat)
plt.show()
Kendall's seems to be more robust to outliers.
health_corr_mat = health_reshaped.corr(method='kendall')
plt.figure(figsize=(12,8))
sns.heatmap(health_corr_mat)
plt.show()
axs = pd.plotting.scatter_matrix(health_reshaped, figsize=(20,16))
n = 53
for x in range(n):
for y in range(n):
# to get the axis of subplots
ax = axs[x, y]
# to make x axis name vertical
ax.xaxis.label.set_rotation(90)
# to make y axis name horizontal
ax.yaxis.label.set_rotation(0)
# to make sure y axis names are outside the plot area
ax.yaxis.label.set_ha('right')
The goal's pretty simple: investigate any relationships with strong positive or negative correlations.
sns.lmplot(
x='Percent of Adults Who Received Seasonal Flu Shot',
y='Percent of High School Graduates (Over Age 18)',
data=health_reshaped,
size=8,
line_kws={'color': 'red'},
scatter_kws={'color': 'red'}
)
plt.show()
sns.lmplot(
x='Percent of Households Whose Housing Costs Exceed 35% of Income',
y='Percent of High School Students Who Currently Smoke',
data=health_reshaped,
size=8,
line_kws={'color': 'green'},
scatter_kws={'color': 'green'}
)
plt.show()
Let's examine the distributions of chronic diseases and how they interact with other variables.
health_reshaped.describe()
chronic_diseases = set(health_df[health_df['Indicator Category']=='Chronic Disease'].Indicator)
for chronic_disease in chronic_diseases:
plt.figure(figsize=(12,8))
sns.distplot(health_reshaped[chronic_disease].dropna(), bins=30)
plt.show()
plt.figure(figsize=(12,8))
sns.violinplot(
data=health_reshaped,
x='Race/Ethnicity',
y='Percent of High School Students Who Currently Smoke'
)
plt.xticks(rotation=60, ha='right')
plt.show()
plt.figure(figsize=(12,8))
sns.violinplot(
x='Race/Ethnicity',
y='Percent of Adults Who Binge Drank',
data=health_reshaped[health_reshaped['Race/Ethnicity']!='All'],
)
plt.xticks(rotation=60, ha='right')
plt.show()
#plt.figure(figsize=(12,8))
sns.lmplot(
x='Firearm-Related Mortality Rate (Age-Adjusted; Per 100,000 people)',
y='Homicide Rate (Age-Adjusted; Per 100,000 people)',
data=health_reshaped[health_reshaped['Race/Ethnicity']!='All'],
hue='State',
size=8,
ci=None,
fit_reg=False
)
plt.show()
sns.lmplot(
x='Median Household Income (Dollars)',
y='Life Expectancy at Birth (Years)',
data=health_reshaped,
size=8,
)
plt.show()
plt.figure(figsize=(12,8))
sns.violinplot(
data=health_reshaped,
x='Race/Ethnicity',
y='Life Expectancy at Birth (Years)'
)
plt.xticks(rotation=60, ha='right')
plt.show()
plt.figure(figsize=(12,8))
sns.violinplot(
data=health_reshaped[(health_reshaped['Sex']!='Both') & (health_reshaped['Race/Ethnicity']=='All')],
x='Race/Ethnicity',
y='Life Expectancy at Birth (Years)',
hue='Sex',
split=True
)
plt.xticks(rotation=60, ha='right')
plt.show()